suppressPackageStartupMessages(library(tidyverse))
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
devtools::load_all('~/Google Drive/My Drive/Scripts/R_packages/myUtilities/')
## ℹ Loading myUtilities

Settings

# Raw sequencing data live on an external volume.
data_dir <- "/Volumes/Mitsu_NGS_3/METTL2A/"

# Project root. It is also made the working directory, so relative paths used
# elsewhere in the script resolve against it.
wd <- "~/Google Drive/My Drive/Analysis/METTL2A/"
setwd(wd)

# Output directories, built by appending to the project root (which already
# ends with a slash).
# figdir <- wd |> paste0("Figures/DRS_m3C_sites/")
fastadir <- wd |> paste0("Fasta/DRS/Kmer_range/")
tabledir <- wd |> paste0("Tables/DRS_m3C_sites/mRNAs/")

# Default ggplot2 theme for every figure: classic look, 7 pt base font,
# legend placed underneath the panel.
theme_set(theme_classic(base_size = 7) + theme(legend.position = "bottom"))

Functions

#' Build a path inside the project directory
#'
#' Prepends the project root to one or more relative paths by plain string
#' concatenation (no separator is inserted).
#'
#' @param path Character vector of paths relative to the project root.
#' @param base Prefix to prepend. It must already end with a path separator
#'   because the pieces are concatenated as-is. Defaults to the global `wd`,
#'   which is looked up lazily at call time, so existing one-argument calls
#'   behave exactly as before.
#' @return Character vector with `base` prepended to each element of `path`.
paste_wd <- function(path, base = wd) {
  paste0(base, path)
}

Read data

RNA sequence of m3C RNAs

# Load the per-transcript sequence table (columns: transcript_id,
# transcript_seq, transcript_length) from the project's database folder.
espresso_AsPC1_transcriptome_seqs <- read_tsv(
  file = paste_wd(
    "Tables/Database/espresso_AsPC1_transcriptome_seqs_2024-04-22.tsv.gz"
  )
)
## Rows: 36717 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): transcript_id, transcript_seq
## dbl (1): transcript_length
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Auto-print the tibble as a quick check of the loaded transcript sequences.
espresso_AsPC1_transcriptome_seqs
## # A tibble: 36,717 × 3
##    transcript_id      transcript_seq                           transcript_length
##    <chr>              <chr>                                                <dbl>
##  1 ENST00000339437.11 AGCCCGGAAGTGCGCGTGGCGGCGGTGGCGGCTGCGGCA…               987
##  2 ENST00000251607.11 AGCCCGGAAGTGCGCGTGGCGGCGGTGGCGGCTGCGGCA…              2252
##  3 ENST00000420393.5  CAGCGGGGCCGGTAAGCGGGCGCGCGCCGCTCAGAGGGG…               854
##  4 ENST00000698415.1  GATGTATGATGAGTTTAGTTGAATGCTCGTGTTGCTGTC…              6597
##  5 ENST00000698416.1  CATGACTAGTTTTGTGGGTAGCAATGATGTTTAAATGTC…              5500
##  6 ENST00000488263.5  AGGAACTTCATCATGAAGTCTCAAGTAAACGAACATTTT…              4528
##  7 ENST00000424814.5  GAGATCAGCAGGACGCTGCGCACAACATGGGCAACCACC…              2038
##  8 ENST00000231948.9  AGACATGGCCGGCGAAGGAGATCAGCAGGACGCTGCGCA…              2187
##  9 ENST00000432408.6  GCCTCCTTTGCGGGTAAACAGACATGGCCGGCGAAGGAG…              2203
## 10 ENST00000459840.5  ATGGAGGCATTTAAACTGGGACTGAGATGGGACTGAGTG…               723
## # ℹ 36,707 more rows

m3C positions

# Load the table of methylated k-mer positions and attach each transcript's
# full sequence and length from the transcriptome table.
# The join key is spelled out: with an implicit natural join, any column name
# that the two tables come to share in the future would silently become part
# of the key and change which rows match.
DRS_methylated_positions_CDSpos <- 
  read_tsv(
    'Tables/DRS_m3C_sites/Metagene_CDS/DRS_methylated_positions_CDSpos_2024-06-05.tsv' |> 
      paste_wd()
  ) |> 
  left_join(espresso_AsPC1_transcriptome_seqs, by = 'transcript_id')
## Rows: 436 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (7): transcript_id, gene_name, seqname, gene_type, ref_kmer, genetype2,...
## dbl (11): kmer_start, kmer_end, kmer_middle, length, rel_kmer_start, rel_kme...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining with `by = join_by(transcript_id)`
# Auto-print the joined table as a quick check of the positions and sequences.
DRS_methylated_positions_CDSpos
## # A tibble: 436 × 20
##    transcript_id     gene_name seqname gene_type    ref_kmer kmer_start kmer_end
##    <chr>             <chr>     <chr>   <chr>        <chr>         <dbl>    <dbl>
##  1 ENST00000429711.7 RPL32     chr3    protein_cod… GCCCA           423      427
##  2 ENST00000647248.2 RPL35A    chr3    protein_cod… ACCCC           381      385
##  3 ENST00000647248.2 RPL35A    chr3    protein_cod… CCCCT           382      386
##  4 ENST00000361390.2 MT-ND1    chrM    protein_cod… ACCCA            33       37
##  5 ENST00000361390.2 MT-ND1    chrM    protein_cod… CCCCT           123      127
##  6 ENST00000361390.2 MT-ND1    chrM    protein_cod… ACCCT           141      145
##  7 ENST00000361390.2 MT-ND1    chrM    protein_cod… ACCCG           186      190
##  8 ENST00000361390.2 MT-ND1    chrM    protein_cod… ACCCT           205      209
##  9 ENST00000361390.2 MT-ND1    chrM    protein_cod… CCCCC           260      264
## 10 ENST00000361390.2 MT-ND1    chrM    protein_cod… ACCTC           322      326
## # ℹ 426 more rows
## # ℹ 13 more variables: kmer_middle <dbl>, genetype2 <chr>, length <dbl>,
## #   rel_kmer_start <dbl>, rel_kmer_middle <dbl>, rel_kmer_end <dbl>,
## #   start <dbl>, end <dbl>, thickStart <dbl>, thickEnd <dbl>,
## #   kmer_region <chr>, transcript_seq <chr>, transcript_length <dbl>

Get neighbor sequences of the m3C sites (mRNA and MT-mRNAs)

# For every methylated k-mer, cut out the k-mer plus up to 5 nt of flanking
# transcript sequence on each side, keep the identifying columns, and build a
# '|'-separated record name (transcript | k-mer middle | gene type | region).
DRS_methylated_positions_CDSpos_neiborseq <- 
  DRS_methylated_positions_CDSpos |> 
  mutate(
    # Clamp the window start to position 1. str_sub() treats negative
    # positions as offsets from the END of the string, so a k-mer starting
    # within the first 4 nt of a transcript would otherwise yield an empty or
    # wrong window. A window running past the 3' end needs no clamp because
    # str_sub() truncates at the end of the string.
    neibor_seq = str_sub(transcript_seq, pmax(kmer_start - 5, 1), kmer_end + 5)
  ) |> 
  select(transcript_id, kmer_middle, ref_kmer, neibor_seq, genetype2, kmer_region) |>
  mutate(name = paste(transcript_id, kmer_middle, genetype2, kmer_region, sep = '|'))
# NOTE(review): the exported file name seen in the rendered output matches
# this object's name, so export_tsv() presumably derives it from the piped
# expression -- keep the bare object name on the left of the pipe.
DRS_methylated_positions_CDSpos_neiborseq |> 
  export_tsv(outdir = tabledir, compression = 'gz')
## 
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/DRS_m3C_sites/mRNAs/DRS_methylated_positions_CDSpos_neiborseq_2024-06-10.tsv.gz
## # A tibble: 436 × 7
##    transcript_id     kmer_middle ref_kmer neibor_seq genetype2 kmer_region name 
##    <chr>                   <dbl> <chr>    <chr>      <chr>     <chr>       <chr>
##  1 ENST00000429711.7         425 GCCCA    GAGCTGCCC… mRNA      CDS         ENST…
##  2 ENST00000647248.2         383 ACCCC    GCTGTACCC… mRNA      CDS         ENST…
##  3 ENST00000647248.2         384 CCCCT    CTGTACCCC… mRNA      CDS         ENST…
##  4 ENST00000361390.2          35 ACCCA    ATTGTACCC… mt-mRNA   CDS         ENST…
##  5 ENST00000361390.2         125 CCCCT    GTAGGCCCC… mt-mRNA   CDS         ENST…
##  6 ENST00000361390.2         143 ACCCT    CTACAACCC… mt-mRNA   CDS         ENST…
##  7 ENST00000361390.2         188 ACCCG    CTAAAACCC… mt-mRNA   CDS         ENST…
##  8 ENST00000361390.2         207 ACCCT    CCATCACCC… mt-mRNA   CDS         ENST…
##  9 ENST00000361390.2         262 CCCCC    ATGAACCCC… mt-mRNA   CDS         ENST…
## 10 ENST00000361390.2         324 ACCTC    TAGCCACCT… mt-mRNA   CDS         ENST…
## # ℹ 426 more rows

Export fasta

mRNA

# Write one FASTA file per transcript region, holding the neighbor sequences
# of the m3C sites found on mRNAs.

# The gene-type filter does not depend on the region, so apply it once
# instead of on every iteration.
mRNA_sites_for_fasta <- 
  DRS_methylated_positions_CDSpos_neiborseq |> 
  filter(genetype2 == 'mRNA')

# Take the regions from the mRNA subset (not from all gene types) and skip
# missing values, so a region without any mRNA site never produces an empty
# FASTA file.
mRNA_regions_for_fasta <- unique(mRNA_sites_for_fasta$kmer_region)
mRNA_regions_for_fasta <- mRNA_regions_for_fasta[!is.na(mRNA_regions_for_fasta)]

for (region in mRNA_regions_for_fasta) {
  
  print(region)
  fasta_basename <- paste0('DRS_methylated_positions_mRNAs_neiborseq_', region)
  
  filtered_df <- 
    mRNA_sites_for_fasta |> 
    filter(kmer_region == region) 
  print(filtered_df)
  
  # `name` becomes the FASTA header and `neibor_seq` the sequence body.
  filtered_df |> 
    export_as_fasta(
      name = name, sequence = neibor_seq, 
      fasta_basename = fasta_basename, outdir = fastadir
    )
  
}
## [1] "CDS"
## # A tibble: 179 × 7
##    transcript_id     kmer_middle ref_kmer neibor_seq genetype2 kmer_region name 
##    <chr>                   <dbl> <chr>    <chr>      <chr>     <chr>       <chr>
##  1 ENST00000429711.7         425 GCCCA    GAGCTGCCC… mRNA      CDS         ENST…
##  2 ENST00000647248.2         383 ACCCC    GCTGTACCC… mRNA      CDS         ENST…
##  3 ENST00000647248.2         384 CCCCT    CTGTACCCC… mRNA      CDS         ENST…
##  4 ENST00000215754.8         182 GCCAC    CGCAGGCCA… mRNA      CDS         ENST…
##  5 ENST00000215754.8         193 GCCCC    GGCAAGCCC… mRNA      CDS         ENST…
##  6 ENST00000270625.7         163 ACCCA    AAGACACCC… mRNA      CDS         ENST…
##  7 ENST00000270625.7         475 CACCA    GCCGGCACC… mRNA      CDS         ENST…
##  8 ENST00000331825.…         347 GCCAC    CGTGAGCCA… mRNA      CDS         ENST…
##  9 ENST00000331825.…         395 GTCTC    CGAGCGTCT… mRNA      CDS         ENST…
## 10 ENST00000331825.…         487 GCCAT    CAGACGCCA… mRNA      CDS         ENST…
## # ℹ 169 more rows
## 
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Fasta/DRS/Kmer_range/DRS_methylated_positions_mRNAs_neiborseq_CDS.fasta
## 
## [1] "fiveprimeUTR"
## # A tibble: 30 × 7
##    transcript_id     kmer_middle ref_kmer neibor_seq genetype2 kmer_region name 
##    <chr>                   <dbl> <chr>    <chr>      <chr>     <chr>       <chr>
##  1 ENST00000215754.8          81 GTCCT    TCCTGGTCC… mRNA      fiveprimeU… ENST…
##  2 ENST00000331825.…          23 GTCTG    CGCGGGTCT… mRNA      fiveprimeU… ENST…
##  3 ENST00000331825.…         124 ACCAT    CCGGGACCA… mRNA      fiveprimeU… ENST…
##  4 ENST00000331825.…         195 ACCAA    TGCCAACCA… mRNA      fiveprimeU… ENST…
##  5 ENST00000501597.3          60 GCCAT    TTAGCGCCA… mRNA      fiveprimeU… ENST…
##  6 ENST00000501597.3          83 GCCAT    TCTGCGCCA… mRNA      fiveprimeU… ENST…
##  7 ENST00000392514.9          61 TCCCT    GGCAATCCC… mRNA      fiveprimeU… ENST…
##  8 ENST00000321153.9          46 CTCCG    GACTTCTCC… mRNA      fiveprimeU… ENST…
##  9 ENST00000273550.…          62 ACCCG    ACGGAACCC… mRNA      fiveprimeU… ENST…
## 10 ENST00000273550.…         104 GCCCT    AGCCAGCCC… mRNA      fiveprimeU… ENST…
## # ℹ 20 more rows
## 
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Fasta/DRS/Kmer_range/DRS_methylated_positions_mRNAs_neiborseq_fiveprimeUTR.fasta
## 
## [1] "threeprimeUTR"
## # A tibble: 45 × 7
##    transcript_id     kmer_middle ref_kmer neibor_seq genetype2 kmer_region name 
##    <chr>                   <dbl> <chr>    <chr>      <chr>     <chr>       <chr>
##  1 ENST00000215754.8         486 ACCCG    CGGGAACCC… mRNA      threeprime… ENST…
##  2 ENST00000199764.7        1700 TTCAG    TCCTTTTCA… mRNA      threeprime… ENST…
##  3 ENST00000552551.5        1981 ACCCA    AGGAGACCC… mRNA      threeprime… ENST…
##  4 ENST00000552551.5        2010 GCCCA    CCTCAGCCC… mRNA      threeprime… ENST…
##  5 ENST00000501597.3         260 GTCTA    CTACTGTCT… mRNA      threeprime… ENST…
##  6 ENST00000501597.3         281 ATCTA    AATGGATCT… mRNA      threeprime… ENST…
##  7 ENST00000501597.3         296 GCCCT    TCATCGCCC… mRNA      threeprime… ENST…
##  8 ENST00000501597.3         314 ACCTC    CGATCACCT… mRNA      threeprime… ENST…
##  9 ENST00000501597.3         323 ACCCA    CTGAGACCC… mRNA      threeprime… ENST…
## 10 ENST00000501597.3         371 ACCTG    CCTGGACCT… mRNA      threeprime… ENST…
## # ℹ 35 more rows
## 
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Fasta/DRS/Kmer_range/DRS_methylated_positions_mRNAs_neiborseq_threeprimeUTR.fasta